Here’s the task: https://preppindata.blogspot.com/2021/01/2021-week-1.html. Essentially:
library(stringr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(googlesheets4)
library(lubridate)
library(plotly)
library(scales)
library(gt)
library(wesanderson)
Get data:
sheet <- "1GYv4573GnJa-C21NYeDj-OhFSTwrK0SnQNF2IQFqa50"
gs4_deauth()
data <- read_sheet(sheet)
data %>% head(10) %>% gt()
| Order ID | Customer Age | Bike Value | Existing Customer? | Date | Store - Bike |
|---|---|---|---|---|---|
| 1 | 22 | 481 | No | 2021-04-25 | York - Road |
| 2 | 28 | 1825 | No | 2021-01-23 | York - Road |
| 3 | 51 | 1903 | No | 2021-07-03 | York - Rood |
| 4 | 59 | 1059 | No | 2021-01-24 | York - Road |
| 5 | 44 | 1764 | Yes | 2021-08-12 | York - Mountain |
| 6 | 16 | 967 | Yes | 2021-08-15 | London - Mountain |
| 7 | 35 | 1575 | Yes | 2021-03-13 | London - Mountain |
| 8 | 50 | 1074 | No | 2021-09-22 | London - Mountain |
| 9 | 37 | 1977 | No | 2021-02-09 | London - Gravel |
| 10 | 55 | 1352 | No | 2021-11-24 | Leeds - Gravel |
Tidy it:
tidy_data <- data %>%
rename_with(~ str_remove(., "\\?|- ") %>%
str_replace_all(., " ", "_") %>%
str_to_lower(.)) %>%
separate(store_bike,
into = c("store", "bike")) %>%
mutate(bike = case_when(bike %in% c("Road", "Rood", "Rowd") ~ "Road",
bike %in% c("Mountain", "Mountaen") ~ "Mountain",
bike %in% c("Graval", "Gravel", "Gravle") ~ "Gravel"),
date = as.Date(date)) %>%
filter(!order_id %in% 1:10)
tidy_data %>% head(10) %>% gt()
| order_id | customer_age | bike_value | existing_customer | date | store | bike |
|---|---|---|---|---|---|---|
| 11 | 57 | 902 | No | 2021-10-04 | Birmingham | Road |
| 12 | 31 | 946 | Yes | 2021-01-17 | Leeds | Road |
| 13 | 17 | 1296 | Yes | 2021-10-25 | Birmingham | Road |
| 14 | 59 | 1166 | Yes | 2021-07-18 | Manchester | Road |
| 15 | 24 | 1781 | No | 2021-10-10 | Manchester | Mountain |
| 16 | 59 | 1074 | No | 2021-10-06 | York | Mountain |
| 17 | 57 | 1188 | No | 2021-09-14 | York | Mountain |
| 18 | 56 | 544 | No | 2021-11-23 | York | Mountain |
| 19 | 34 | 579 | Yes | 2021-11-24 | York | Gravel |
| 20 | 17 | 1021 | Yes | 2021-06-24 | York | Gravel |
Add the missing dates the the missing zeros …
complete_data <- tidy_data %>% select(date, bike, bike_value) %>%
complete(date = seq.Date(min(date), max(date), by = "day"),
bike = bike) %>%
replace_na(list(bike_value = 0))
complete_data %>% head(10) %>% gt()
| date | bike | bike_value |
|---|---|---|
| 2021-01-05 | Gravel | 0 |
| 2021-01-05 | Mountain | 0 |
| 2021-01-05 | Road | 2898 |
| 2021-01-06 | Gravel | 0 |
| 2021-01-06 | Mountain | 0 |
| 2021-01-06 | Road | 0 |
| 2021-01-07 | Gravel | 0 |
| 2021-01-07 | Mountain | 3450 |
| 2021-01-07 | Road | 0 |
| 2021-01-08 | Gravel | 1132 |
Summarise average sales per day of the month …
summarised_data <- complete_data %>%
mutate(quarter = paste0("Quarter ", quarter(date)),
day_of_month = mday(date)) %>%
group_by(quarter, day_of_month, bike) %>%
summarise(average_sales_value = mean(bike_value)) %>%
ungroup() %>%
arrange(day_of_month) %>%
group_by(quarter, bike) %>%
mutate(typical_cumulative_monthly_sales = cumsum(average_sales_value)) %>%
ungroup()
## `summarise()` regrouping output by 'quarter', 'day_of_month' (override with `.groups` argument)
summarised_data %>% arrange(quarter, bike, day_of_month) %>% head(10) %>% gt()
| quarter | day_of_month | bike | average_sales_value | typical_cumulative_monthly_sales |
|---|---|---|---|---|
| Quarter 1 | 1 | Gravel | 994.0000 | 994.000 |
| Quarter 1 | 2 | Gravel | 2461.4000 | 3455.400 |
| Quarter 1 | 3 | Gravel | 661.6667 | 4117.067 |
| Quarter 1 | 4 | Gravel | 556.5000 | 4673.567 |
| Quarter 1 | 5 | Gravel | 0.0000 | 4673.567 |
| Quarter 1 | 6 | Gravel | 601.2500 | 5274.817 |
| Quarter 1 | 7 | Gravel | 1678.5000 | 6953.317 |
| Quarter 1 | 8 | Gravel | 1705.0000 | 8658.317 |
| Quarter 1 | 9 | Gravel | 568.0000 | 9226.317 |
| Quarter 1 | 10 | Gravel | 499.6667 | 9725.983 |
Plot …
palette <- wes_palette("GrandBudapest1", n = 3)
plot <- ggplot(data = summarised_data, aes(x = day_of_month, y = typical_cumulative_monthly_sales)) +
geom_line(aes(col = bike),
lwd = 1.2, alpha = 0.7) +
scale_colour_manual(values = palette) +
scale_y_continuous(labels = unit_format(unit = "K", scale = 1e-3)) +
labs(title = "The profile of monthly bike sales",
x = "Day of the month",
y = "Average running total of sales (£)",
col = "Bike type") +
facet_wrap(~ quarter, ncol = 1, strip.position = "left") +
theme_light() +
theme(strip.text.y.left = element_text(angle = 0))
plot
Interactive plot (after a bit of plotly gymnastics) …
int_plot <- plot %>% ggplotly()
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
str(int_plot[["x"]][['layout']][['annotations']][[2]])
## List of 13
## $ text : chr "Average running total of sales (£)"
## $ x : num -0.0134
## $ y : num 0.5
## $ showarrow : logi FALSE
## $ ax : num 0
## $ ay : num 0
## $ font :List of 3
## ..$ color : chr "rgba(0,0,0,1)"
## ..$ family: chr ""
## ..$ size : num 14.6
## $ xref : chr "paper"
## $ yref : chr "paper"
## $ textangle : num -90
## $ xanchor : chr "right"
## $ yanchor : chr "center"
## $ annotationType: chr "axis"
int_plot[["x"]][['layout']][['annotations']][[2]][["x"]] <- -0.05
int_plot